Stock Market Prediction

In [1]:
import math,random
import quandl
import numpy as np
import pandas as pd
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression,SGDRegressor,BayesianRidge,ARDRegression,PassiveAggressiveRegressor,TheilSenRegressor
from sklearn.svm import SVR
from sklearn.ensemble import GradientBoostingRegressor,RandomForestRegressor,StackingRegressor,VotingRegressor
from sklearn.neural_network import MLPRegressor
import plotly.express as px
import warnings
warnings.filterwarnings('ignore')
In [2]:
# Ticker symbol used for the single-stock walkthrough cells below.
stock = "MSFT"
# Number of trailing rows PreprocessData holds out of training and predicts on;
# it also determines the label shift there (ceil(0.12 * daysToForecast) rows).
daysToForecast = 251
In [3]:
def getStockData(stock):
    """Download the full 'WIKI/<stock>' dataset from Quandl.

    Parameters
    ----------
    stock : str
        Ticker symbol; it is appended to the 'WIKI/' dataset prefix.

    Returns
    -------
    Whatever ``quandl.get`` returns for that dataset code (the rest of this
    notebook treats it as a DataFrame with 'Adj. *' columns).

    Notes
    -----
    The API key is read from the ``QUANDL_API_KEY`` environment variable when
    it is set, so credentials can be supplied without editing the notebook.
    """
    import os

    # NOTE(review): the literal fallback only exists so that existing runs keep
    # working unchanged; a key committed to source should be rotated and this
    # default removed once QUANDL_API_KEY is configured everywhere.
    quandl.ApiConfig.api_key = os.environ.get("QUANDL_API_KEY", "qWcicxSctVxrP9PhyneG")
    return quandl.get('WIKI/' + stock)
In [4]:
def FormatDataForModel(dataArray):
    """Derive the model feature table from raw adjusted price data.

    Parameters
    ----------
    dataArray : pandas.DataFrame
        Must contain the columns 'Adj. Open', 'Adj. High', 'Adj. Low',
        'Adj. Close' and 'Adj. Volume'. It is not modified.

    Returns
    -------
    pandas.DataFrame
        A new frame with the columns 'Adj. Close', 'HL_PCT', 'PCT_change' and
        'Adj. Volume', where missing values are replaced by -99999.
    """
    # Explicit copy: the derived columns are written to our own frame rather
    # than to a slice of the caller's data (avoids chained-assignment warnings).
    features = dataArray[['Adj. Open', 'Adj. High', 'Adj. Low', 'Adj. Close', 'Adj. Volume']].copy()
    # Distance of the high above the close, as a percentage of the close.
    features['HL_PCT'] = (features['Adj. High'] - features['Adj. Close']) / features['Adj. Close'] * 100.0
    # Open-to-close move, as a percentage of the open.
    features['PCT_change'] = (features['Adj. Close'] - features['Adj. Open']) / features['Adj. Open'] * 100.0
    features = features[['Adj. Close', 'HL_PCT', 'PCT_change', 'Adj. Volume']]
    # Sentinel value for gaps; returns a new frame instead of filling in place.
    return features.fillna(-99999)
In [5]:
def PreprocessData(mlData,daysToForecast):
    """Split the feature table into training arrays and a held-out forecast window.

    Parameters
    ----------
    mlData : pandas.DataFrame
        Feature table containing an 'Adj. Close' column. A 'label' column is
        added to it in place (the close shifted ``forecast_out`` rows back,
        where ``forecast_out = ceil(0.12 * daysToForecast)``).
    daysToForecast : int
        Number of trailing rows to hold out; must be at least 1 and smaller
        than ``len(mlData)``.

    Returns
    -------
    list
        ``[X, y, X_data, forecastData]`` where ``X``/``y`` are the scaled
        features and labels of all but the last ``daysToForecast`` rows,
        ``X_data`` are the scaled features of the last ``daysToForecast`` rows
        and ``forecastData`` is the matching slice of ``mlData``.

    Raises
    ------
    ValueError
        If ``daysToForecast`` is not in ``1 .. len(mlData) - 1``.
    """
    # A window of 0 would make X[-0:] select every row and X[:-0] select none;
    # a window >= len(mlData) leaves nothing to train on. Fail early instead.
    if not 0 < daysToForecast < len(mlData):
        raise ValueError(
            "daysToForecast must be between 1 and len(mlData) - 1, got %r" % (daysToForecast,)
        )

    forecast_col = 'Adj. Close'
    forecast_out = int(math.ceil(0.12*daysToForecast))
    mlData['label'] = mlData[forecast_col].shift(-forecast_out)

    # Keyword form: passing the axis positionally is rejected by pandas >= 2.0.
    X = np.array(mlData.drop(columns=['label']))
    # NOTE(review): scaling statistics are computed over all rows, including
    # the held-out window below.
    X = preprocessing.scale(X)
    X_data = X[-daysToForecast:]
    X = X[:-daysToForecast]

    forecastData = mlData[-daysToForecast:]
    trainData = mlData[:-daysToForecast]
    # forecast_out <= daysToForecast, so none of the training labels fall in
    # the NaN tail produced by the shift.
    y = np.array(trainData['label'])
    return [X, y, X_data, forecastData]
In [6]:
def TrainAndPredict(model,X,y,X_data,test_size=0.2,random_state=None):
    """Fit ``model`` on a random train split, score it, and predict ``X_data``.

    Parameters
    ----------
    model : estimator
        Object exposing ``fit(X, y)``, ``score(X, y)`` and ``predict(X)``.
    X, y : array-like
        Features and labels to split into train and test parts.
    X_data : array-like
        Features to predict after fitting.
    test_size : float or int, default 0.2
        Forwarded to ``train_test_split``; the default matches the previous
        hard-coded value.
    random_state : int or None, default None
        Forwarded to ``train_test_split``. Pass an int to make the split, and
        therefore the reported score, reproducible between runs. ``None``
        keeps the previous unseeded behaviour.

    Returns
    -------
    tuple
        ``(accuracy, prediction)``: ``model.score`` on the test split and
        ``model.predict(X_data)``. For scikit-learn regressors the score is
        the R^2 coefficient of determination, not a classification accuracy.
    """
    # NOTE(review): train_test_split shuffles rows by default, so the test
    # split is interleaved in time with the training rows — confirm that is
    # intended for time-series data.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )
    model.fit(X_train, y_train)
    accuracy = model.score(X_test, y_test)
    prediction = model.predict(X_data)
    return accuracy, prediction
In [7]:
def addPredictionToForecast(prediction,forecastData):
    """Return a two-column frame pairing the actual close with the prediction.

    The 'Adj. Close' column of ``forecastData`` is kept under the name 'EOD'
    and ``prediction`` is attached as a 'prediction' column. ``forecastData``
    itself is left untouched.
    """
    return (
        forecastData[['Adj. Close']]
        .rename(columns={'Adj. Close':'EOD'})
        .assign(prediction=prediction[:])
    )
In [8]:
def GraphPredictions(forecastData,stock):
    """Show a plotly line chart of ``forecastData`` titled with ``stock``.

    The frame is handed to ``px.line`` as-is; the axes are labelled 'Time'
    and 'Price'. Nothing is returned.
    """
    layout = dict(title=stock, xaxis_title='Time', yaxis_title='Price')
    chart = px.line(forecastData)
    chart.update_layout(**layout)
    chart.show()
In [9]:
def GraphAllData(allData,forecastData,stock):
    """Show the full 'Adj. Close' history together with the predictions.

    ``allData['Adj. Close']`` and ``forecastData['prediction']`` are joined
    column-wise on their indexes and drawn as one plotly line chart titled
    with ``stock``. Nothing is returned.
    """
    series = [allData['Adj. Close'], forecastData['prediction']]
    combined = pd.concat(series, axis=1, sort=False)
    layout = dict(title=stock, xaxis_title='Time', yaxis_title='Price')
    chart = px.line(combined)
    chart.update_layout(**layout)
    chart.show()
In [10]:
# End-to-end run for the single configured ticker: download, build features,
# split off the forecast window, fit a linear model and attach its predictions.
# The names bound here (allData, forecastData, accuracy, ...) are read by the
# cells that follow.
allData = getStockData(stock)
mlData = FormatDataForModel(allData)
X,y,X_data,forecastData = PreprocessData(mlData,daysToForecast)
model = LinearRegression()
accuracy,prediction=TrainAndPredict(model,X,y,X_data)
# forecastData is rebound to the two-column EOD/prediction frame.
forecastData = addPredictionToForecast(prediction,forecastData)
In [11]:
# Value returned by model.score on the test split inside TrainAndPredict.
print(accuracy)
0.9821885840976519
In [12]:
# Actual close (EOD) vs. prediction over the held-out window.
GraphPredictions(forecastData,stock)
In [13]:
# Full adjusted-close history with the predictions overlaid.
GraphAllData(allData,forecastData,stock)
In [14]:
# Tickers processed by the multi-stock loops below.
stock_list = ['AAPL', 'IBM', 'MSFT', 'WMT','AMZN','TSLA','PLUG','GOOGL','FB','CRM']
In [15]:
# Repeat the single-stock pipeline with a fresh LinearRegression for every
# ticker in stock_list, printing the score and drawing both charts.
# Note: the loop rebinds the notebook-level names stock, allData, mlData, X, y,
# X_data, forecastData, model, accuracy and prediction; after it finishes they
# hold the values for the last ticker.
for stock in stock_list:
    print("Stock: ", stock)
    allData = getStockData(stock)
    mlData = FormatDataForModel(allData)
    X,y,X_data,forecastData = PreprocessData(mlData,daysToForecast)
    model = LinearRegression()
    accuracy,prediction=TrainAndPredict(model,X,y,X_data)
    print("Accuracy: ", accuracy)
    forecastData = addPredictionToForecast(prediction,forecastData)
    GraphPredictions(forecastData,stock)
    GraphAllData(allData,forecastData,stock)
Stock:  AAPL
Accuracy:  0.9890691625580129
Stock:  IBM
Accuracy:  0.9900969449212501
Stock:  MSFT
Accuracy:  0.9786975475748013
Stock:  WMT
Accuracy:  0.9921003456967479
Stock:  AMZN
Accuracy:  0.9846456838790258
Stock:  TSLA
Accuracy:  0.9170350574165489
Stock:  PLUG
Accuracy:  0.5892379906640812
Stock:  GOOGL
Accuracy:  0.9700025142206737
Stock:  FB
Accuracy:  0.9621041912934818
Stock:  CRM
Accuracy:  0.97246946267249
In [16]:
# Candidate regressors to compare. Each entry is [estimator instance, label];
# every estimator is constructed with its default arguments, and the label is
# used as the row/column name in the result tables below.
model_list = [[LinearRegression(), "LinearRegression"],
              [SVR(),"SupportVectorRegression"],
              [MLPRegressor(),"MLPRegressor"],
              [SGDRegressor(),"SGDRegressor"],
              [BayesianRidge(),"BayesianRidge"],
              [ARDRegression(),"ARDRegression"],
              [PassiveAggressiveRegressor(),"PassiveAggressiveRegressor"],
              [TheilSenRegressor(),"TheilSenRegressor"]]
In [17]:
# Evaluate every model in model_list on every ticker in stock_list.
# model_results collects (model label, ticker, score) tuples;
# stock_dfs collects (ticker, frame) pairs where the frame holds the actual
# close of the forecast window plus one prediction column per model.
model_results = []
stock_dfs = []
for stock in stock_list:
    print("Stock: ", stock)
    allData = getStockData(stock)
    mlData = FormatDataForModel(allData)
    X,y,X_data,forecastData = PreprocessData(mlData,daysToForecast)
    df_stocks = forecastData[['Adj. Close']]
    df_stocks = df_stocks.rename(columns={'Adj. Close':stock+' Actual'})
    # The same estimator instances are re-fitted for each ticker; each call to
    # TrainAndPredict draws its own train/test split.
    for model,name in model_list:
        accuracy,prediction=TrainAndPredict(model,X,y,X_data)
        print("Model: ",name , "  ","Accuracy:", accuracy)
        model_results.append((name,stock,accuracy))
        df_stocks[name] = prediction[:]
    stock_dfs.append((stock,df_stocks))
Stock:  AAPL
Model:  LinearRegression    Accuracy: 0.9880007028066012
Model:  SupportVectorRegression    Accuracy: 0.9773271973158659
Model:  MLPRegressor    Accuracy: 0.9890062609191727
Model:  SGDRegressor    Accuracy: 0.9907668561693616
Model:  BayesianRidge    Accuracy: 0.9909966637796256
Model:  ARDRegression    Accuracy: 0.9890171246202556
Model:  PassiveAggressiveRegressor    Accuracy: 0.9826002727190235
Model:  TheilSenRegressor    Accuracy: 0.989659703896199
Stock:  IBM
Model:  LinearRegression    Accuracy: 0.9904394226536867
Model:  SupportVectorRegression    Accuracy: 0.9782764442527898
Model:  MLPRegressor    Accuracy: 0.9900436963218242
Model:  SGDRegressor    Accuracy: 0.9900236236799528
Model:  BayesianRidge    Accuracy: 0.9908465372008897
Model:  ARDRegression    Accuracy: 0.9907567585295833
Model:  PassiveAggressiveRegressor    Accuracy: 0.9843662766694251
Model:  TheilSenRegressor    Accuracy: 0.9902513414250712
Stock:  MSFT
Model:  LinearRegression    Accuracy: 0.979514795592019
Model:  SupportVectorRegression    Accuracy: 0.9701888229394415
Model:  MLPRegressor    Accuracy: 0.9817909694835244
Model:  SGDRegressor    Accuracy: 0.9802413154927502
Model:  BayesianRidge    Accuracy: 0.9797352069751335
Model:  ARDRegression    Accuracy: 0.9818460092740748
Model:  PassiveAggressiveRegressor    Accuracy: 0.9770617903826334
Model:  TheilSenRegressor    Accuracy: 0.9818859776481084
Stock:  WMT
Model:  LinearRegression    Accuracy: 0.9929697011026284
Model:  SupportVectorRegression    Accuracy: 0.9859360602799189
Model:  MLPRegressor    Accuracy: 0.9928422814882302
Model:  SGDRegressor    Accuracy: 0.9922634825758246
Model:  BayesianRidge    Accuracy: 0.9924492063428494
Model:  ARDRegression    Accuracy: 0.9913767556304232
Model:  PassiveAggressiveRegressor    Accuracy: 0.9657854499819
Model:  TheilSenRegressor    Accuracy: 0.9922444876485393
Stock:  AMZN
Model:  LinearRegression    Accuracy: 0.9858675464463936
Model:  SupportVectorRegression    Accuracy: 0.8179698596351699
Model:  MLPRegressor    Accuracy: 0.984053629847749
Model:  SGDRegressor    Accuracy: 0.9838001321048168
Model:  BayesianRidge    Accuracy: 0.9840217236513057
Model:  ARDRegression    Accuracy: 0.984642988344154
Model:  PassiveAggressiveRegressor    Accuracy: 0.9832012224854058
Model:  TheilSenRegressor    Accuracy: 0.9870771738071662
Stock:  TSLA
Model:  LinearRegression    Accuracy: 0.9383533100662305
Model:  SupportVectorRegression    Accuracy: 0.8839002536157954
Model:  MLPRegressor    Accuracy: 0.8887510728909285
Model:  SGDRegressor    Accuracy: 0.9265340562519799
Model:  BayesianRidge    Accuracy: 0.9354629248103332
Model:  ARDRegression    Accuracy: 0.9222201847082739
Model:  PassiveAggressiveRegressor    Accuracy: 0.9222795573958589
Model:  TheilSenRegressor    Accuracy: 0.927162652224615
Stock:  PLUG
Model:  LinearRegression    Accuracy: 0.6663334219315424
Model:  SupportVectorRegression    Accuracy: 0.17811597583035832
Model:  MLPRegressor    Accuracy: 0.7242600833910804
Model:  SGDRegressor    Accuracy: 0.7026028769853929
Model:  BayesianRidge    Accuracy: 0.6866974086635708
Model:  ARDRegression    Accuracy: 0.7085936070396494
Model:  PassiveAggressiveRegressor    Accuracy: 0.6955645182508845
Model:  TheilSenRegressor    Accuracy: 0.7590890934967135
Stock:  GOOGL
Model:  LinearRegression    Accuracy: 0.9768958216618981
Model:  SupportVectorRegression    Accuracy: 0.8107320773052518
Model:  MLPRegressor    Accuracy: 0.9027073538292942
Model:  SGDRegressor    Accuracy: 0.9745954638532988
Model:  BayesianRidge    Accuracy: 0.9728909940552097
Model:  ARDRegression    Accuracy: 0.9732623608916389
Model:  PassiveAggressiveRegressor    Accuracy: 0.9696294931778203
Model:  TheilSenRegressor    Accuracy: 0.9754108113150521
Stock:  FB
Model:  LinearRegression    Accuracy: 0.9640535896757296
Model:  SupportVectorRegression    Accuracy: 0.8835144060175042
Model:  MLPRegressor    Accuracy: 0.6876829102293779
Model:  SGDRegressor    Accuracy: 0.9668526704403181
Model:  BayesianRidge    Accuracy: 0.973361400387332
Model:  ARDRegression    Accuracy: 0.9644340917981894
Model:  PassiveAggressiveRegressor    Accuracy: 0.9659090253955337
Model:  TheilSenRegressor    Accuracy: 0.9652221372234495
Stock:  CRM
Model:  LinearRegression    Accuracy: 0.9746124825811431
Model:  SupportVectorRegression    Accuracy: 0.9498138702106361
Model:  MLPRegressor    Accuracy: 0.9759428730119094
Model:  SGDRegressor    Accuracy: 0.9772471650717187
Model:  BayesianRidge    Accuracy: 0.9756020610503052
Model:  ARDRegression    Accuracy: 0.9780492835789717
Model:  PassiveAggressiveRegressor    Accuracy: 0.9618508667708663
Model:  TheilSenRegressor    Accuracy: 0.9761963994464137
In [18]:
# Score table: one row per model label, one column per ticker.
model_names = [name for _, name in model_list]
# dtype=float keeps the table numeric from the start; an empty frame would
# otherwise be object-dtype, forcing astype(float) before idxmax and making
# mean/max run on object columns.
df = pd.DataFrame(index=model_names, columns=stock_list, dtype=float)
for model_name, ticker, score in model_results:
    df.at[model_name, ticker] = score
df
Out[18]:
AAPL IBM MSFT WMT AMZN TSLA PLUG GOOGL FB CRM
LinearRegression 0.988001 0.990439 0.979515 0.99297 0.985868 0.938353 0.666333 0.976896 0.964054 0.974612
SupportVectorRegression 0.977327 0.978276 0.970189 0.985936 0.81797 0.8839 0.178116 0.810732 0.883514 0.949814
MLPRegressor 0.989006 0.990044 0.981791 0.992842 0.984054 0.888751 0.72426 0.902707 0.687683 0.975943
SGDRegressor 0.990767 0.990024 0.980241 0.992263 0.9838 0.926534 0.702603 0.974595 0.966853 0.977247
BayesianRidge 0.990997 0.990847 0.979735 0.992449 0.984022 0.935463 0.686697 0.972891 0.973361 0.975602
ARDRegression 0.989017 0.990757 0.981846 0.991377 0.984643 0.92222 0.708594 0.973262 0.964434 0.978049
PassiveAggressiveRegressor 0.9826 0.984366 0.977062 0.965785 0.983201 0.92228 0.695565 0.969629 0.965909 0.961851
TheilSenRegressor 0.98966 0.990251 0.981886 0.992244 0.987077 0.927163 0.759089 0.975411 0.965222 0.976196
In [19]:
# For each ticker column of the score table: [ticker, best model label, best score].
highest = [
    [ticker, df[ticker].astype(float).idxmax(), df[ticker].max()]
    for ticker in df.columns
]
df_high = pd.DataFrame(highest, columns=["Stock","Model","Accuracy"])
df_high
Out[19]:
Stock Model Accuracy
0 AAPL BayesianRidge 0.990997
1 IBM BayesianRidge 0.990847
2 MSFT TheilSenRegressor 0.981886
3 WMT LinearRegression 0.992970
4 AMZN TheilSenRegressor 0.987077
5 TSLA LinearRegression 0.938353
6 PLUG TheilSenRegressor 0.759089
7 GOOGL LinearRegression 0.976896
8 FB BayesianRidge 0.973361
9 CRM ARDRegression 0.978049
In [20]:
# Mean score of each model across all tickers (row-wise mean of the table).
average = df.mean(axis=1)
In [21]:
# Rank models from highest to lowest mean score.
average.sort_values(ascending=False)
Out[21]:
TheilSenRegressor             0.954420
SGDRegressor                  0.948493
ARDRegression                 0.948420
BayesianRidge                 0.948206
LinearRegression              0.945704
PassiveAggressiveRegressor    0.940825
MLPRegressor                  0.911708
SupportVectorRegression       0.843577
dtype: float64
In [22]:
# One chart per ticker: actual close of the forecast window plus every model's
# prediction. GraphPredictions already draws exactly this px.line chart with
# the same title/axis layout, so reuse it instead of repeating the plotting code.
for stock,stock_df in stock_dfs:
    GraphPredictions(stock_df,stock)